*******************************
* Title: rwanda_jde_table7.do
* Author: Todd Pugatch
* Last update: 10 Nov 2020
* Description: analysis for Blimpo and Pugatch, "Entrepreneurship Education
*	and Teacher Training in Rwanda," Stage 2 Registered report, Journal of 
*	Development Economics
* Inputs: 	student_merge_jde.dta
* Outputs: 	rwanda_jde_table7.[txt/out]
* Notes: produces Table 7
********************************

* Set environment
local start=`"$S_TIME"'
clear
clear matrix
clear mata
graph drop _all
program drop _all
cap log close
set more off

* Set directories 
*global main "[SET MAIN DIRECTORY HERE]"
	global rawdata "$main/01_data/01_raw"
	global cleandata "$main/01_data/02_clean"
	global dofiles "$main/02_dofiles"
	global results "$main/03_results"
	global temp "$main/04_output"


* begin log file
log using "$temp/rwanda_jde_table7.txt", text replace

****************************
* SET UP MULTIPLE HYPOTHESIS CORRECTION
/*Proceed in 3 steps:
	1. Run all regressions for H1. Save coefficients & std. errors.
	2. Calculate q-values for all hypotheses.
	3. Re-run regressions to get output, including q-values.*/
****************************

* 1. GET COEF/SE/P FOR ALL REGRESSIONS	
* ANALYSIS
/*Goal: produce columns 1-10 of Table P6
	Uses endline student survey.
	Regress outcome on treatment status, controlling for randomization strata. 
	Cluster s.e.'s by school.
	For Lee bounds, proceed in 3 steps:
		1) regress outcome on baseline and strata indicators
		2) get residuals
		3) calculate Lee bounds on these residuals
	Calculate Lee bounds first, then regression. This allows for exporting bounds
		into regression results. Note that this method does not always produce
		bounds containing the original coefficient, because treatment effect 
		from residualized regression not exactly equal to original treatment
		effect (because of potentially spurious correlation between treatment
		and covariates).*/
qui use "$cleandata/student_merge_jde.dta", clear

/*get mean/sd of selected control group outcomes, per request of Della Vigna et al
	team at Berkeley gathering predictions on project outcomes*/
su dropout personal_business_el if treatment==0

/*DROPOUT*/
/*Column 1: dropout*/
* dropout appears in student endline survey (not head teacher survey, as originally written in JDE Reg. Report Stage 1)
* note that we don't know if those who attrited from survey have dropped out or not
* also, there is no relevant baseline outcome to include here
local i=1
qui areg dropout treatment, a(strata) cluster(school_code)
scalar define H4_c`i'b=_b[treatment]
scalar define H4_c`i'se=_se[treatment]
scalar define t=_b[treatment]/_se[treatment]
scalar define H4_c`i'p=2*ttail(e(df_r),abs(t))
local i=`i'+1

/*ECONOMIC ACTIVITY*/	
* winsorize all financial variables used in analysis
local fin_el "earn_last2mths_usd profits_last2mths_adj_usd earn_alt_adj_usd"
local fin_bl "earn_last2mths_usd business_inc_last2mths_usd"
foreach w in el bl {
	foreach x in `fin_`w'' {
		winsor `x'_`w', gen(`x'w_`w') p(.01) highonly
		lab var `x'w_`w' "`x'_`w', winsorized at 99th percentile"
	}
}

* prep missing values for students without baseline outcome
/*first, impute zero for (winsorized) business income for those who don't report owning a business 
	(original version conditioned on business ownership)*/
qui replace business_inc_last2mths_usdw_bl=0 if ownbusiness_bl==0

/*now impute control group mean to remaining missing values*/
local Y0 "ownbusiness_bl buspartners_school_bl buspartners_family_bl ownbusiness_nonag_bl earn_money_bl earn_last2mths_usdw_bl business_inc_last2mths_usdw_bl"
foreach y0 in `Y0' {
	qui gen `y0'i=`y0'
	qui replace `y0'i=r(mean) if `y0'==.	
	qui su `y0' if treatment==0
	qui gen `y0'm=(`y0'==.)
	lab var `y0'i "`y0', imputing control mean for missing values"
	lab var `y0'm "missing value for `y0'"
}

/*Columns 2-8: business participation, characteristics, employment*/
/*set up outcomes and control variables for loop*/
local Y "personal_business_el business_solo_el business_sbc_el business_famorpeers_el businesstype_nonag_el	business_employs_el	employed_el"
local y02 "ownbusiness_bl"
local y03 "ownbusiness_bl"
local y04 "buspartners_school_bl"
local y05 "buspartners_family_bl"
local y06 "ownbusiness_nonag_bl"
local y07 "ownbusiness_bl"
local y08 "earn_money_bl"

/*regressions: columns 2-8*/
local stat ""control mean",mu_c,"baseline mean",mu_0,"lower bound",lb,"upper bound",ub"
local i=2 /*initialize loop*/
foreach y in `Y' {
	qui areg `y' treatment `y0`i''i `y0`i''m, a(strata) cluster(school_code)
	scalar define H4_c`i'b=_b[treatment]
	scalar define H4_c`i'se=_se[treatment]
	scalar define t=_b[treatment]/_se[treatment]
	scalar define H4_c`i'p=2*ttail(e(df_r),abs(t))
	local i=`i'+1
}

/*Columns 9-10: income and business profits*/
/*also use alternative measure for column 9, as robustness check*/
local Y "earn_last2mths_usdw_el	profits_last2mths_adj_usdw_el earn_alt_adj_usdw_el"
local y09 "earn_last2mths_usdw_bl"
local y010 "business_inc_last2mths_usdw_bl"
local y011 "earn_last2mths_usdw_bl"
local i=9 /*initialize loop*/
foreach y in `Y' {
	qui areg `y' treatment `y0`i''i `y0`i''m, a(strata) cluster(school_code)
	scalar define H4_c`i'b=_b[treatment]
	scalar define H4_c`i'se=_se[treatment]
	scalar define t=_b[treatment]/_se[treatment]
	scalar define H4_c`i'p=2*ttail(e(df_r),abs(t))
	local i=`i'+1
}

* 2. CALCULATE Q-VALUES FOR ALL HYPOTHESES
/*Calculate sharpened q-values from Benjamini, Krieger, Yuketeli (2006).
	Use code from fdr_sharpened_qvalues.do from Anderson (2008), available at 
		https://are.berkeley.edu/~mlanderson/ARE_Website/Research.html*/
* prep data: put p-values from above into a dataset
clear
local n=`i'-1
qui set obs `n'
qui gen hypothesis=4
qui gen column=_n
foreach x in b se pval {
	qui gen double `x'=.
}
forval i=1/`n' {
	qui replace b=H4_c`i'b in `i'
	qui replace se=H4_c`i'se in `i'
	qui replace pval=H4_c`i'p in `i' 
}

* START OF ANDERSON CODE FROM fdr_sharpened_qvalues.do (modified)
* Collect the total number of p-values tested
quietly sum pval
local totalpvals = r(N)

* Sort the p-values in ascending order and generate a variable that codes each p-value's rank

quietly gen int original_sorting_order = _n
quietly sort pval
quietly gen int rank = _n if pval~=.

* Generate the variable that will contain the BKY (2006) sharpened q-values

qui gen bky06_qval = 1 if pval~=.
	
* Set the initial counter to 1 

local qval = 1

/* Set up a loop that begins by checking which hypotheses are rejected at q = 1.000, 
	then checks which hypotheses are rejected at q = 0.999, 
	then checks which hypotheses are rejected at q = 0.998, etc.  
	The loop ends by checking which hypotheses are rejected at q = 0.001. */
while `qval' > 0 {
	* First Stage
	* Generate the adjusted first stage q level we are testing: q' = q/1+q
	local qval_adj = `qval'/(1+`qval')
	
	* Generate value q'*r/M
	qui gen fdr_temp1 = `qval_adj'*rank/`totalpvals'
	
	* Generate binary variable checking condition p(r) <= q'*r/M
	qui gen reject_temp1 = (fdr_temp1>=pval) if pval~=.
	
	* Generate variable containing p-value ranks for all p-values that meet above condition
	qui gen reject_rank1 = reject_temp1*rank
	
	* Record the rank of the largest p-value that meets above condition
	qui egen total_rejected1 = max(reject_rank1)

	* Second Stage
	* Generate the second stage q level that accounts for hypotheses rejected in first stage: q_2st = q'*(M/m0)
	local qval_2st = `qval_adj'*(`totalpvals'/(`totalpvals'-total_rejected1[1]))
	
	* Generate value q_2st*r/M
	qui gen fdr_temp2 = `qval_2st'*rank/`totalpvals'
	
	* Generate binary variable checking condition p(r) <= q_2st*r/M
	qui gen reject_temp2 = (fdr_temp2>=pval) if pval~=.
	
	* Generate variable containing p-value ranks for all p-values that meet above condition
	qui gen reject_rank2 = reject_temp2*rank
	
	* Record the rank of the largest p-value that meets above condition
	qui egen total_rejected2 = max(reject_rank2)

	* A p-value has been rejected at level q if its rank is less than or equal to the rank of the max p-value that meets the above condition
	qui replace bky06_qval = `qval' if rank <= total_rejected2 & rank~=.
	
	* Reduce q by 0.001 and repeat loop
	drop fdr_temp* reject_temp* reject_rank* total_rejected*
	local qval = `qval' - .001
}

quietly sort original_sorting_order

******************************
* END OF ANDERSON CODE FROM fdr_sharpened_qvalues.do (modified)
******************************
* list results
list hypothesis column b se pval bky06_qval

* prep results for inclusion in final regression output
forval i=1/`n' {
	scalar define H4_c`i'q=bky06_qval in `i'
}

* 3. RE-RUN REGRESSIONS TO GET OUTPUT, INCLUDING Q-VALUES
* ANALYSIS
/*Goal: produce columns 1-10 of Table P6
	Uses endline student survey.
	Regress outcome on treatment status, controlling for randomization strata. 
	Cluster s.e.'s by school.
	For Lee bounds, proceed in 3 steps:
		1) regress outcome on baseline and strata indicators
		2) get residuals
		3) calculate Lee bounds on these residuals
	Calculate Lee bounds first, then regression. This allows for exporting bounds
		into regression results. Note that this method does not always produce
		bounds containing the original coefficient, because treatment effect 
		from residualized regression not exactly equal to original treatment
		effect (because of potentially spurious correlation between treatment
		and covariates).
	Include sharpened q-values from above.*/

qui use "$cleandata/student_merge_jde.dta", clear

* define program for obtaining Lee bounds
program getleebounds
	qui predict e if e(sample), residuals
	qui leebounds e treatment, select(insample_el)
	scalar define lb=_b[lower]
	scalar define ub=_b[upper]
	drop e
end

/*DROPOUT*/
/*Column 1: dropout*/
* dropout appears in student endline survey (not head teacher survey, as originally written in JDE Reg. Report Stage 1)
* note that we don't know if those who attrited from survey have dropped out or not
* also, there is no relevant baseline outcome to include here
local stat ""control mean",mu_c,"q-value",qv,"lower bound",lb,"upper bound",ub"
local i=1
qui areg dropout, a(strata) cluster(school_code)
getleebounds

qui areg dropout treatment, a(strata) cluster(school_code)
qui su dropout if treatment==0 & e(sample)
scalar define mu_c=r(mean)
scalar define qv=H4_c`i'q
outreg2 using "$results/rwanda_jde_table7.xls", se excel nolabel nocons addstat(`stat') adec(3) replace

/*ECONOMIC ACTIVITY*/	
* winsorize all financial variables used in analysis
local fin_el "earn_last2mths_usd profits_last2mths_adj_usd earn_alt_adj_usd"
local fin_bl "earn_last2mths_usd business_inc_last2mths_usd"
foreach w in el bl {
	foreach x in `fin_`w'' {
		winsor `x'_`w', gen(`x'w_`w') p(.01) highonly
		lab var `x'w_`w' "`x'_`w', winsorized at 99th percentile"
	}
}

* prep missing values for students without baseline outcome
/*first, impute zero for (winsorized) business income for those who don't report owning a business 
	(original version conditioned on business ownership)*/
qui replace business_inc_last2mths_usdw_bl=0 if ownbusiness_bl==0

/*now impute control group mean to remaining missing values*/
local Y0 "ownbusiness_bl buspartners_school_bl buspartners_family_bl ownbusiness_nonag_bl earn_money_bl earn_last2mths_usdw_bl business_inc_last2mths_usdw_bl"
foreach y0 in `Y0' {
	qui gen `y0'i=`y0'
	qui replace `y0'i=r(mean) if `y0'==.	
	qui su `y0' if treatment==0
	qui gen `y0'm=(`y0'==.)
	lab var `y0'i "`y0', imputing control mean for missing values"
	lab var `y0'm "missing value for `y0'"
}

/*Columns 2-8: business participation, characteristics, employment*/
/*set up outcomes and control variables for loop*/
local Y "personal_business_el business_solo_el business_sbc_el business_famorpeers_el businesstype_nonag_el	business_employs_el	employed_el"
local y02 "ownbusiness_bl"
local y03 "ownbusiness_bl"
local y04 "buspartners_school_bl"
local y05 "buspartners_family_bl"
local y06 "ownbusiness_nonag_bl"
local y07 "ownbusiness_bl"
local y08 "earn_money_bl"

/*regressions: columns 2-8*/
local stat ""control mean",mu_c,"baseline mean",mu_0,"q-value",qv,"lower bound",lb,"upper bound",ub"
local i=2 /*initialize loop*/
foreach y in `Y' {
	qui areg `y' `y0`i''i `y0`i''m, a(strata) cluster(school_code)
	getleebounds
	
	qui areg `y' treatment `y0`i''i `y0`i''m, a(strata) cluster(school_code)
	qui su `y' if treatment==0 & e(sample)
	scalar define mu_c=r(mean)
	qui su `y0`i'' if e(sample)
	scalar define mu_0=r(mean)
	scalar define qv=H4_c`i'q
	outreg2 using "$results/rwanda_jde_table7.xls", se excel nolabel nocons addstat(`stat') adec(3) append
	local i=`i'+1
}

/*Columns 9-10: income and business profits*/
/*also use alternative measure for column 9, as robustness check*/
local Y "earn_last2mths_usdw_el	profits_last2mths_adj_usdw_el earn_alt_adj_usdw_el"
local y09 "earn_last2mths_usdw_bl"
local y010 "business_inc_last2mths_usdw_bl"
local y011 "earn_last2mths_usdw_bl"
local i=9 /*initialize loop*/
foreach y in `Y' {
	qui areg `y' `y0`i''i `y0`i''m, a(strata) cluster(school_code)
	getleebounds
	
	qui areg `y' treatment `y0`i''i `y0`i''m, a(strata) cluster(school_code)
	qui su `y' if treatment==0 & e(sample)
	scalar define mu_c=r(mean)
	qui su `y0`i'' if e(sample)
	scalar define mu_0=r(mean)
	scalar define qv=H4_c`i'q
	outreg2 using "$results/rwanda_jde_table7.xls", se excel nolabel nocons addstat(`stat') adec(3) append
	local i=`i'+1
}



local end=`"$S_TIME"' 
di "`start'"
di "`end'"
log close
